#### Replication code for structural topic models in Holland and Rios, CPS, 2021

# Working directory: every data file and saved-model file below is read via a
# relative path, so it is resolved against this folder. Set the environment
# variable STM_REPLICATION_DIR to point at your own copy of the replication
# files; when it is unset, the original author's path is used unchanged.
setwd(Sys.getenv(
  "STM_REPLICATION_DIR",
  unset = "/Users/bradleyholland/Dropbox/STM_Search_Results"
))

# Remove every (non-hidden) object from the current workspace before running.
rm(list = ls())

# Load packages
library(stm)
library(readstata13)
library(dplyr)
library(grid)

# LOAD DATA
# load() restores saved R objects into the workspace. The cbind() below uses an
# object called `data`, so "Mex_DATA.rds" is expected to provide it.
# NOTE(review): despite its ".rds" extension the file is read with load(), not
# readRDS() -- presumably it was written with save(); confirm.
load("Mex_DATA.rds")
data2 <- read.dta13("Mex_FinalData.dta")

# Fail early with a clear message if the inputs are not what the rest of the
# script relies on. If load() did not create `data`, the name would otherwise
# resolve to the utils::data() function.
if (!is.data.frame(data)) {
  stop("Mex_DATA.rds did not provide a data frame named 'data'", call. = FALSE)
}
if (nrow(data) != nrow(data2)) {
  stop(
    "Mex_DATA.rds and Mex_FinalData.dta have different numbers of rows",
    call. = FALSE
  )
}

# Combine the two sources column-wise; rows are matched purely by position.
data <- cbind(data, data2)


# Derived covariates used as prevalence / content covariates further down.

# Disabled indicators, kept for reference:
# data$edu_cat <- as.factor(ifelse(data$educ == 4, 1, 0))
# data$act_cat <- as.factor(ifelse(data$code_type == 1 | data$activity == "MinerÃ­a", 1, 0))

# Rank-based terciles of `tasahom2017` (presumably the 2017 homicide rate --
# confirm against the codebook); bucket 3 holds the highest values.
data$vio_tile <- ntile(data$tasahom2017, 3)

# 0/1 factors: top violence tercile, `code_employ` of 2 or below, and the
# factor version of `export`.
data$hom17_cat <- as.factor(as.numeric(data$vio_tile == 3))
data$employ_cat <- as.factor(as.numeric(data$code_employ <= 2))
data$export_cat <- as.factor(data$export)

# Experimental arm, taken from whichever open-ended response column is
# non-empty (checked in the order VIO, CORR, POST); "" when all are empty.
data$GROUP <- ifelse(
  data$TREAT_VIO != "",
  "VIO",
  ifelse(
    data$TREAT_CORR != "",
    "CORR",
    ifelse(data$CONTROL_POST != "", "POST", "")
  )
)

# Numeric indicator for a non-empty TREAT_VIO response.
data$TREATMENT <- ifelse(data$TREAT_VIO != "", 1, 0)

# One text per respondent: the three response columns joined with spaces.
data$documents <- paste(
  data$TREAT_VIO,
  data$TREAT_CORR,
  data$CONTROL_POST,
  sep = " "
)

# Prep text data
# textProcessor() builds the corpus from the raw responses; passing
# `metadata = data` keeps the covariates attached to the documents.
processed <- textProcessor(data$documents, metadata = data)

# Build the documents / vocabulary / metadata used by the models.
# The script used to call prepDocuments() twice in a row -- first with
# `upper.thresh = 250`, then with the defaults -- so the first result was
# overwritten immediately and only the default-threshold output was ever used.
# The redundant call is kept below as a commented-out alternative.
# NOTE(review): confirm that the default thresholds match the preprocessing
# behind the saved selectModel() results loaded later in this script.
# out <- prepDocuments(processed$documents, processed$vocab, processed$meta, upper.thresh = 250)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta



#### Figure 5: Examples of concerns that focus on socioeconomic issues

# Run 200 models and return the ones that perform well (uncomment the line
# below to run from scratch):
# sresult10 <- selectModel(docs,vocab,10, prevalence=~GROUP,data=meta, runs=200)

# Load the saved results of the line above (expected to provide `sresult10`):
load("stm_search_results_K10_200runs_111518.RData")

# Subjectively select a model in which topics and representative texts are
# relatively coherent:
prev_mod <- sresult10$runout[[20]]

# Get the 50 most representative responses of topic 5, then subjectively pick
# two that illustrate the relevant dynamics of the theory.
# `texts` has to be in the same order as the modeled documents, so it is taken
# from the prepared metadata (`meta`), whose rows follow the documents that
# survived preprocessing. The earlier `data$documents[1:1633]` relied on a
# hard-coded row count and goes out of alignment as soon as any earlier
# response is dropped during text processing.
thoughts5 <- findThoughts(prev_mod, texts = meta$documents, n = 50, topics = 5)
thoughts5

# Translate and plot the illustrative texts:
sentence_vec_topic5 <- c(
  "[violence affects] the social environment, the security of the population, and this decreases economic activity.",
  "Because there aren't sufficient job offerings, fathers and new generations look toward the options that organized crime offers."
)
dev.new(width = 5, height = 5)
plotQuote(sentence_vec_topic5, width = 50, main = "Representative Responses from Topic 5")
dev.off()

#### Figure 4: Difference in topic prevalence between control and treatment groups

# Run the prevalence model on `prev_mod` (the model selected in the Figure 5
# section above). The metadata argument is spelled out in full; the previous
# `meta =` only reached `metadata` through partial argument matching.
prep <- estimateEffect(
  1:10 ~ GROUP,
  prev_mod,
  metadata = out$meta,
  uncertainty = "Global"
)

# Get topic words:
labelTopics(prev_mod, n = 10)

# Plot the VIO-minus-POST difference in topic proportions for all ten topics.
dev.new(width = 11, height = 5)
# Wide left outer margin to leave room for the long custom topic labels.
par(oma = c(0, 16, 0, 0))
# NOTE(review): `oma = 100` below appears to be forwarded through `...` rather
# than matched to a plot.estimateEffect() argument; the margin is already set
# by par() above -- confirm whether it is needed.
plot(
  prep,
  covariate = "GROUP",
  topics = 1:10,
  model = prev_mod,
  method = "difference",
  cov.value1 = "VIO",
  cov.value2 = "POST",
  width = 500,
  custom.labels = c(
    "Topic 1: lado, valor, manera, exist, trabajo",
    "Topic 2: postal, correo, documento, envío, mensajería",
    "Topic 3: secuestro, inversion, violencia, temor, patrimonio",
    "Topic 4: empleado, encarec, trabajar, producto, producción",
    "Topic 5: paí, dinero, desigualdad, pobr, clase",
    "Topic 6: corrupto, gobierno, funcionario, ley, estat",
    "Topic 7: avanzar, oportunidad, operar, confianza, realizar",
    "Topic 8: económica, cierr, perdida, disminución, invers",
    "Topic 9: desarrollo, económico, creación, gasto, falta",
    "Topic 10: autoridad, ciudadano, proyecto, economico, cuota"
  ),
  labeltype = "custom",
  xlab = "Difference in Topic Proportion (treatment-control)",
  oma = 100
)
dev.off()


#### Figure 6: Respondents relying exclusively on domestic markets discuss concerns differently

# Select model (uncomment to run from scratch):
#sresult10__export_cont <- selectModel(docs,vocab,10, prevalence=~GROUP,data=meta, content = ~ export_cat, runs=200)

# Load the saved results of the line above (expected to provide
# `sresult10__export_cont`):
load("stm_search_results_K10_export_cont_200runs_112218.RData")

# Subjectively select a coherent model:
test_mod <- sresult10__export_cont$runout[[9]]

# Get topic words:
labelTopics(test_mod, n = 10, frexweight = 0.2)

# Get representative texts for topic 6.
# `texts` has to be in the same order as the modeled documents, so it is taken
# from the prepared metadata (`meta`) rather than from the first 1633 raw rows
# of `data`, which misalign as soon as any earlier response is dropped during
# text processing.
thoughts <- findThoughts(test_mod, texts = meta$documents, n = 10, topics = 6)
thoughts

# Translate and plot topic words and two exemplar texts:
sentence_vec_topic6 <- c(
  "Topic words: economy, activity, start up, kidnapping, develoment, money, fear, workers, investment, unrest ",
  "Exemplar 1: [Violence] inhibits investments, leads to spending on private security, and reduces the conditions that facilitate prosperity.",
  "Exemplar 2: The loss of family members or people (through kidnapping), of wealth (by extortion and robbery), and of business (for the lack of regular customers in business centers)."
)
dev.new(width = 6, height = 6)
plotQuote(sentence_vec_topic6, width = 60)
# Call dev.off() -- the bare name `dev.off` only printed the function and left
# this graphics device open.
dev.off()

# Plot differences in word choice based on export activity:
dev.new(width = 5, height = 5)
plot(
  test_mod,
  n = 24,
  type = "perspectives",
  topics = 6,
  plabels = c("No Export", "Export"),
  text.cex = .5
)
dev.off()


#### Figure 7: Respondents working in low- and high-violence municipalities discuss concerns differently

# Select model (uncomment to run from scratch):
#sresult10__hom_cont <- selectModel(docs,vocab,10, prevalence=~GROUP,data=meta, content = ~ hom17_cat, runs=200)

# Load the saved results of the line above (expected to provide
# `sresult10__hom_cont`):
load("stm_search_results_K10_hom_cont_200runs_111918.RData")

# Subjectively select a coherent model:
test_mod <- sresult10__hom_cont$runout[[4]]

# Get topic words:
labelTopics(test_mod, n = 12)

# Get representative texts for topic 1.
# `texts` has to be in the same order as the modeled documents, so it is taken
# from the prepared metadata (`meta`) rather than from the first 1633 raw rows
# of `data`, which misalign as soon as any earlier response is dropped during
# text processing.
thoughts1 <- findThoughts(test_mod, texts = meta$documents, n = 50, topics = 1)
thoughts1

# Translate and plot topic words and two exemplar texts:
sentence_vec_topic1 <- c(
  "Topic words: closing, employment, investment, capital flight, investment, loss, right, reduce, crime",
  "Exemplar 1: There can be many risks, and it unleashes consequences like kidnapping, lack of liquidity, worse services than we already have, and hunger entering the population",
  "Exemplar 2: Everything is stretched to the point of breaking, and that is about to happen to businesses! Fear, extortion, kidnapping from the city in the trunks of cars, and greedy politicians, and criminals that continuously more organized and colluded with government. It's already a [descaro y secreto a gritos públicos]; where THEY SAY NOTHING. The abuse on the entire working class has no limits; we're the ones who pay for IT ALL. Robberies, assaults, deaths, and nothing happens. Abuses by the governemnt in Mexico City, abuses by the delegations and nothing happens. The conseqence is that we are running out of those who want to work, because those who remain only want easy and fast money. It costs us a lot and they want it ALL without measure."
)
dev.new(width = 5, height = 5)
plotQuote(sentence_vec_topic1, width = 60)
dev.off()

# Plot differences in word choice based on violence level:
dev.new(width = 5, height = 5)
plot(
  test_mod,
  n = 12,
  type = "perspectives",
  topics = 1,
  plabels = c("Low-Vio", "High-Vio"),
  text.cex = .5
)
dev.off()


#### Figure 8: Respondents representing small and large firms discuss concerns differently

# Select model (uncomment to run from scratch):
#sresult10__workforce_cont <- selectModel(docs,vocab,10, prevalence=~GROUP,data=meta, content = ~ employ_cat, runs=200)

# Load the saved results of the line above (expected to provide
# `sresult10__workforce_cont`):
load("stm_search_results_K10_workforce_cont_200runs_021519.RData")

# Subjectively select a coherent model:
test_mod <- sresult10__workforce_cont$runout[[8]]

# Get topic words:
labelTopics(test_mod, n = 10)

# Get representative texts for topic 2.
# `texts` has to be in the same order as the modeled documents, so it is taken
# from the prepared metadata (`meta`) rather than from the first 1633 raw rows
# of `data`, which misalign as soon as any earlier response is dropped during
# text processing.
thoughts <- findThoughts(test_mod, texts = meta$documents, n = 50, topics = 2)
thoughts

# Translate and plot topic words and two exemplar texts.
# The vector is named for topic 2, the topic it describes; it was previously
# called `sentence_vec_topic1`, a leftover from the Figure 7 section.
sentence_vec_topic2 <- c(
  "Topic words: complicate, climate, country, family, environment, inequality, violence, insecurity, investment, poverty",
  "Exemplar 1: Besides the risks of physical insecurity and the effects on quality of life, [violence causes] business instability, diminished opportunities for doing business in the state, and almost nobody wants to invest in Michoacan",
  "Exemplar 2: [Violence causes] a lack of certainty, distrust in personal interactions, the growing desertion of labor, extortion payments, and lower sales."
)
dev.new(width = 5, height = 5)
plotQuote(sentence_vec_topic2, width = 60)
dev.off()

# Plot differences in word choice based on workforce size:
dev.new(width = 5, height = 5)
plot(
  test_mod,
  n = 20,
  type = "perspectives",
  topics = 2,
  plabels = c("Large Firm", "Small Firm"),
  text.cex = .5
)
dev.off()
